1   package org.apache.lucene.search;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.util.LinkedList;
22  
23  import org.apache.lucene.analysis.CannedTokenStream;
24  import org.apache.lucene.analysis.Token;
25  import org.apache.lucene.document.Document;
26  import org.apache.lucene.document.Field;
27  import org.apache.lucene.document.TextField;
28  import org.apache.lucene.index.DirectoryReader;
29  import org.apache.lucene.index.IndexReader;
30  import org.apache.lucene.index.IndexWriter;
31  import org.apache.lucene.index.IndexWriterConfig;
32  import org.apache.lucene.index.MultiFields;
33  import org.apache.lucene.index.RandomIndexWriter;
34  import org.apache.lucene.index.Term;
35  import org.apache.lucene.index.TermsEnum;
36  import org.apache.lucene.search.similarities.DefaultSimilarity;
37  import org.apache.lucene.store.Directory;
38  import org.apache.lucene.store.RAMDirectory;
39  import org.apache.lucene.util.BytesRef;
40  import org.apache.lucene.util.LuceneTestCase;
41  import org.junit.Ignore;
42  
43  /**
44   * This class tests the MultiPhraseQuery class.
45   * 
46   * 
47   */
48  public class TestMultiPhraseQuery extends LuceneTestCase {
49    
50    public void testPhrasePrefix() throws IOException {
51      Directory indexStore = newDirectory();
52      RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
53      add("blueberry pie", writer);
54      add("blueberry strudel", writer);
55      add("blueberry pizza", writer);
56      add("blueberry chewing gum", writer);
57      add("bluebird pizza", writer);
58      add("bluebird foobar pizza", writer);
59      add("piccadilly circus", writer);
60      
61      IndexReader reader = writer.getReader();
62      IndexSearcher searcher = newSearcher(reader);
63      
64      // search for "blueberry pi*":
65      MultiPhraseQuery query1 = new MultiPhraseQuery();
66      // search for "strawberry pi*":
67      MultiPhraseQuery query2 = new MultiPhraseQuery();
68      query1.add(new Term("body", "blueberry"));
69      query2.add(new Term("body", "strawberry"));
70      
71      LinkedList<Term> termsWithPrefix = new LinkedList<>();
72      
73      // this TermEnum gives "piccadilly", "pie" and "pizza".
74      String prefix = "pi";
75      TermsEnum te = MultiFields.getFields(reader).terms("body").iterator();
76      te.seekCeil(new BytesRef(prefix));
77      do {
78        String s = te.term().utf8ToString();
79        if (s.startsWith(prefix)) {
80          termsWithPrefix.add(new Term("body", s));
81        } else {
82          break;
83        }
84      } while (te.next() != null);
85      
86      query1.add(termsWithPrefix.toArray(new Term[0]));
87      assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
88      query2.add(termsWithPrefix.toArray(new Term[0]));
89      assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2
90          .toString());
91      
92      ScoreDoc[] result;
93      result = searcher.search(query1, 1000).scoreDocs;
94      assertEquals(2, result.length);
95      result = searcher.search(query2, 1000).scoreDocs;
96      assertEquals(0, result.length);
97      
98      // search for "blue* pizza":
99      MultiPhraseQuery query3 = new MultiPhraseQuery();
100     termsWithPrefix.clear();
101     prefix = "blue";
102     te.seekCeil(new BytesRef(prefix));
103     
104     do {
105       if (te.term().utf8ToString().startsWith(prefix)) {
106         termsWithPrefix.add(new Term("body", te.term().utf8ToString()));
107       }
108     } while (te.next() != null);
109     
110     query3.add(termsWithPrefix.toArray(new Term[0]));
111     query3.add(new Term("body", "pizza"));
112     
113     result = searcher.search(query3, 1000).scoreDocs;
114     assertEquals(2, result.length); // blueberry pizza, bluebird pizza
115     assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
116     
117     // test slop:
118     query3.setSlop(1);
119     result = searcher.search(query3, 1000).scoreDocs;
120     
121     // just make sure no exc:
122     searcher.explain(query3, 0);
123     
124     assertEquals(3, result.length); // blueberry pizza, bluebird pizza, bluebird
125                                     // foobar pizza
126     
127     MultiPhraseQuery query4 = new MultiPhraseQuery();
128     try {
129       query4.add(new Term("field1", "foo"));
130       query4.add(new Term("field2", "foobar"));
131       fail();
132     } catch (IllegalArgumentException e) {
133       // okay, all terms must belong to the same field
134     }
135     
136     writer.close();
137     reader.close();
138     indexStore.close();
139   }
140 
141   // LUCENE-2580
142   public void testTall() throws IOException {
143     Directory indexStore = newDirectory();
144     RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
145     add("blueberry chocolate pie", writer);
146     add("blueberry chocolate tart", writer);
147     IndexReader r = writer.getReader();
148     writer.close();
149 
150     IndexSearcher searcher = newSearcher(r);
151     MultiPhraseQuery q = new MultiPhraseQuery();
152     q.add(new Term("body", "blueberry"));
153     q.add(new Term("body", "chocolate"));
154     q.add(new Term[] {new Term("body", "pie"), new Term("body", "tart")});
155     assertEquals(2, searcher.search(q, 1).totalHits);
156     r.close();
157     indexStore.close();
158   }
159   
160   @Ignore //LUCENE-3821 fixes sloppy phrase scoring, except for this known problem 
161   public void testMultiSloppyWithRepeats() throws IOException {
162     Directory indexStore = newDirectory();
163     RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
164     add("a b c d e f g h i k", writer);
165     IndexReader r = writer.getReader();
166     writer.close();
167     
168     IndexSearcher searcher = newSearcher(r);
169     
170     MultiPhraseQuery q = new MultiPhraseQuery();
171     // this will fail, when the scorer would propagate [a] rather than [a,b],
172     q.add(new Term[] {new Term("body", "a"), new Term("body", "b")});
173     q.add(new Term[] {new Term("body", "a")});
174     q.setSlop(6);
175     assertEquals(1, searcher.search(q, 1).totalHits); // should match on "a b"
176     
177     r.close();
178     indexStore.close();
179   }
180 
181   public void testMultiExactWithRepeats() throws IOException {
182     Directory indexStore = newDirectory();
183     RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
184     add("a b c d e f g h i k", writer);
185     IndexReader r = writer.getReader();
186     writer.close();
187     
188     IndexSearcher searcher = newSearcher(r);
189     MultiPhraseQuery q = new MultiPhraseQuery();
190     q.add(new Term[] {new Term("body", "a"), new Term("body", "d")}, 0);
191     q.add(new Term[] {new Term("body", "a"), new Term("body", "f")}, 2);
192     assertEquals(1, searcher.search(q, 1).totalHits); // should match on "a b"
193     r.close();
194     indexStore.close();
195   }
196   
197   private void add(String s, RandomIndexWriter writer) throws IOException {
198     Document doc = new Document();
199     doc.add(newTextField("body", s, Field.Store.YES));
200     writer.addDocument(doc);
201   }
202   
203   public void testBooleanQueryContainingSingleTermPrefixQuery()
204       throws IOException {
205     // this tests against bug 33161 (now fixed)
206     // In order to cause the bug, the outer query must have more than one term
207     // and all terms required.
208     // The contained PhraseMultiQuery must contain exactly one term array.
209     Directory indexStore = newDirectory();
210     RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
211     add("blueberry pie", writer);
212     add("blueberry chewing gum", writer);
213     add("blue raspberry pie", writer);
214     
215     IndexReader reader = writer.getReader();
216     IndexSearcher searcher = newSearcher(reader);
217     // This query will be equivalent to +body:pie +body:"blue*"
218     BooleanQuery.Builder q = new BooleanQuery.Builder();
219     q.add(new TermQuery(new Term("body", "pie")), BooleanClause.Occur.MUST);
220     
221     MultiPhraseQuery trouble = new MultiPhraseQuery();
222     trouble.add(new Term[] {new Term("body", "blueberry"),
223         new Term("body", "blue")});
224     q.add(trouble, BooleanClause.Occur.MUST);
225     
226     // exception will be thrown here without fix
227     ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
228     
229     assertEquals("Wrong number of hits", 2, hits.length);
230     
231     // just make sure no exc:
232     searcher.explain(q.build(), 0);
233     
234     writer.close();
235     reader.close();
236     indexStore.close();
237   }
238   
239   public void testPhrasePrefixWithBooleanQuery() throws IOException {
240     Directory indexStore = newDirectory();
241     RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
242     add("This is a test", "object", writer);
243     add("a note", "note", writer);
244     
245     IndexReader reader = writer.getReader();
246     IndexSearcher searcher = newSearcher(reader);
247     
248     // This query will be equivalent to +type:note +body:"a t*"
249     BooleanQuery.Builder q = new BooleanQuery.Builder();
250     q.add(new TermQuery(new Term("type", "note")), BooleanClause.Occur.MUST);
251     
252     MultiPhraseQuery trouble = new MultiPhraseQuery();
253     trouble.add(new Term("body", "a"));
254     trouble
255         .add(new Term[] {new Term("body", "test"), new Term("body", "this")});
256     q.add(trouble, BooleanClause.Occur.MUST);
257     
258     // exception will be thrown here without fix for #35626:
259     ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
260     assertEquals("Wrong number of hits", 0, hits.length);
261     writer.close();
262     reader.close();
263     indexStore.close();
264   }
265   
266   public void testNoDocs() throws Exception {
267     Directory indexStore = newDirectory();
268     RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
269     add("a note", "note", writer);
270     
271     IndexReader reader = writer.getReader();
272     IndexSearcher searcher = newSearcher(reader);
273     
274     MultiPhraseQuery q = new MultiPhraseQuery();
275     q.add(new Term("body", "a"));
276     q.add(new Term[] {new Term("body", "nope"), new Term("body", "nope")});
277     assertEquals("Wrong number of hits", 0,
278         searcher.search(q, 1).totalHits);
279     
280     // just make sure no exc:
281     searcher.explain(q, 0);
282     
283     writer.close();
284     reader.close();
285     indexStore.close();
286   }
287   
288   public void testHashCodeAndEquals() {
289     MultiPhraseQuery query1 = new MultiPhraseQuery();
290     MultiPhraseQuery query2 = new MultiPhraseQuery();
291     
292     assertEquals(query1.hashCode(), query2.hashCode());
293     assertEquals(query1, query2);
294     
295     Term term1 = new Term("someField", "someText");
296     
297     query1.add(term1);
298     query2.add(term1);
299     
300     assertEquals(query1.hashCode(), query2.hashCode());
301     assertEquals(query1, query2);
302     
303     Term term2 = new Term("someField", "someMoreText");
304     
305     query1.add(term2);
306     
307     assertFalse(query1.hashCode() == query2.hashCode());
308     assertFalse(query1.equals(query2));
309     
310     query2.add(term2);
311     
312     assertEquals(query1.hashCode(), query2.hashCode());
313     assertEquals(query1, query2);
314   }
315   
316   private void add(String s, String type, RandomIndexWriter writer)
317       throws IOException {
318     Document doc = new Document();
319     doc.add(newTextField("body", s, Field.Store.YES));
320     doc.add(newStringField("type", type, Field.Store.NO));
321     writer.addDocument(doc);
322   }
323   
324   // LUCENE-2526
325   public void testEmptyToString() {
326     new MultiPhraseQuery().toString();
327   }
328   
329   public void testCustomIDF() throws Exception {
330     Directory indexStore = newDirectory();
331     RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
332     add("This is a test", "object", writer);
333     add("a note", "note", writer);
334     
335     IndexReader reader = writer.getReader();
336     IndexSearcher searcher = newSearcher(reader);
337     searcher.setSimilarity(new DefaultSimilarity() { 
338       @Override
339       public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
340         return Explanation.match(10f, "just a test");
341       } 
342     });
343     
344     MultiPhraseQuery query = new MultiPhraseQuery();
345     query.add(new Term[] { new Term("body", "this"), new Term("body", "that") });
346     query.add(new Term("body", "is"));
347     Weight weight = query.createWeight(searcher, true);
348     assertEquals(10f * 10f, weight.getValueForNormalization(), 0.001f);
349 
350     writer.close();
351     reader.close();
352     indexStore.close();
353   }
354 
355   public void testZeroPosIncr() throws IOException {
356     Directory dir = new RAMDirectory();
357     final Token[] tokens = new Token[3];
358     tokens[0] = new Token();
359     tokens[0].append("a");
360     tokens[0].setPositionIncrement(1);
361     tokens[1] = new Token();
362     tokens[1].append("b");
363     tokens[1].setPositionIncrement(0);
364     tokens[2] = new Token();
365     tokens[2].append("c");
366     tokens[2].setPositionIncrement(0);
367 
368     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
369     Document doc = new Document();
370     doc.add(new TextField("field", new CannedTokenStream(tokens)));
371     writer.addDocument(doc);
372     doc = new Document();
373     doc.add(new TextField("field", new CannedTokenStream(tokens)));
374     writer.addDocument(doc);
375     IndexReader r = writer.getReader();
376     writer.close();
377     IndexSearcher s = newSearcher(r);
378     MultiPhraseQuery mpq = new MultiPhraseQuery();
379     //mpq.setSlop(1);
380 
381     // NOTE: not great that if we do the else clause here we
382     // get different scores!  MultiPhraseQuery counts that
383     // phrase as occurring twice per doc (it should be 1, I
384     // think?).  This is because MultipleTermPositions is able to
385     // return the same position more than once (0, in this
386     // case):
387     if (true) {
388       mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
389       mpq.add(new Term[] {new Term("field", "a")}, 0);
390     } else {
391       mpq.add(new Term[] {new Term("field", "a")}, 0);
392       mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
393     }
394     TopDocs hits = s.search(mpq, 2);
395     assertEquals(2, hits.totalHits);
396     assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
397     /*
398     for(int hit=0;hit<hits.totalHits;hit++) {
399       ScoreDoc sd = hits.scoreDocs[hit];
400       System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
401     }
402     */
403     r.close();
404     dir.close();
405   }
406 
407   private static Token makeToken(String text, int posIncr) {
408     final Token t = new Token();
409     t.append(text);
410     t.setPositionIncrement(posIncr);
411     return t;
412   }
413 
414   private final static Token[] INCR_0_DOC_TOKENS = new Token[] {
415     makeToken("x", 1),
416     makeToken("a", 1),
417     makeToken("1", 0),
418     makeToken("m", 1),  // not existing, relying on slop=2
419     makeToken("b", 1),
420     makeToken("1", 0),
421     makeToken("n", 1), // not existing, relying on slop=2
422     makeToken("c", 1),
423     makeToken("y", 1)
424   };
425   
426   private final static Token[] INCR_0_QUERY_TOKENS_AND = new Token[] {
427     makeToken("a", 1),
428     makeToken("1", 0),
429     makeToken("b", 1),
430     makeToken("1", 0),
431     makeToken("c", 1)
432   };
433   
434   private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new Token[][] {
435     { makeToken("a", 1) },
436     { makeToken("x", 1), makeToken("1", 0) },
437     { makeToken("b", 2) },
438     { makeToken("x", 2), makeToken("1", 0) },
439     { makeToken("c", 3) }
440   };
441   
442   private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new Token[][] {
443     { makeToken("x", 1) },
444     { makeToken("a", 1), makeToken("1", 0) },
445     { makeToken("x", 2) },
446     { makeToken("b", 2), makeToken("1", 0) },
447     { makeToken("c", 3) }
448   };
449   
450   /**
451    * using query parser, MPQ will be created, and will not be strict about having all query terms 
452    * in each position - one of each position is sufficient (OR logic)
453    */
454   public void testZeroPosIncrSloppyParsedAnd() throws IOException {
455     MultiPhraseQuery q = new MultiPhraseQuery();
456     q.add(new Term[]{ new Term("field", "a"), new Term("field", "1") }, -1);
457     q.add(new Term[]{ new Term("field", "b"), new Term("field", "1") }, 0);
458     q.add(new Term[]{ new Term("field", "c") }, 1);
459     doTestZeroPosIncrSloppy(q, 0);
460     q.setSlop(1);
461     doTestZeroPosIncrSloppy(q, 0);
462     q.setSlop(2);
463     doTestZeroPosIncrSloppy(q, 1);
464   }
465   
466   private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
467     Directory dir = newDirectory(); // random dir
468     IndexWriterConfig cfg = newIndexWriterConfig(null);
469     IndexWriter writer = new IndexWriter(dir, cfg);
470     Document doc = new Document();
471     doc.add(new TextField("field", new CannedTokenStream(INCR_0_DOC_TOKENS)));
472     writer.addDocument(doc);
473     IndexReader r = DirectoryReader.open(writer,false);
474     writer.close();
475     IndexSearcher s = newSearcher(r);
476     
477     if (VERBOSE) {
478       System.out.println("QUERY=" + q);
479     }
480     
481     TopDocs hits = s.search(q, 1);
482     assertEquals("wrong number of results", nExpected, hits.totalHits);
483     
484     if (VERBOSE) {
485       for(int hit=0;hit<hits.totalHits;hit++) {
486         ScoreDoc sd = hits.scoreDocs[hit];
487         System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
488       }
489     }
490     
491     r.close();
492     dir.close();
493   }
494 
495   /**
496    * PQ AND Mode - Manually creating a phrase query
497    */
498   public void testZeroPosIncrSloppyPqAnd() throws IOException {
499     PhraseQuery.Builder builder = new PhraseQuery.Builder();
500     int pos = -1;
501     for (Token tap : INCR_0_QUERY_TOKENS_AND) {
502       pos += tap.getPositionIncrement();
503       builder.add(new Term("field", tap.toString()), pos);
504     }
505     builder.setSlop(0);
506     doTestZeroPosIncrSloppy(builder.build(), 0);
507     builder.setSlop(1);
508     doTestZeroPosIncrSloppy(builder.build(), 0);
509     builder.setSlop(2);
510     doTestZeroPosIncrSloppy(builder.build(), 1);
511   }
512 
513   /**
514    * MPQ AND Mode - Manually creating a multiple phrase query
515    */
516   public void testZeroPosIncrSloppyMpqAnd() throws IOException {
517     final MultiPhraseQuery mpq = new MultiPhraseQuery();
518     int pos = -1;
519     for (Token tap : INCR_0_QUERY_TOKENS_AND) {
520       pos += tap.getPositionIncrement();
521       mpq.add(new Term[]{new Term("field",tap.toString())}, pos); //AND logic
522     }
523     doTestZeroPosIncrSloppy(mpq, 0);
524     mpq.setSlop(1);
525     doTestZeroPosIncrSloppy(mpq, 0);
526     mpq.setSlop(2);
527     doTestZeroPosIncrSloppy(mpq, 1);
528   }
529 
530   /**
531    * MPQ Combined AND OR Mode - Manually creating a multiple phrase query
532    */
533   public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException {
534     final MultiPhraseQuery mpq = new MultiPhraseQuery();
535     for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
536       Term[] terms = tapTerms(tap);
537       final int pos = tap[0].getPositionIncrement()-1;
538       mpq.add(terms, pos); //AND logic in pos, OR across lines 
539     }
540     doTestZeroPosIncrSloppy(mpq, 0);
541     mpq.setSlop(1);
542     doTestZeroPosIncrSloppy(mpq, 0);
543     mpq.setSlop(2);
544     doTestZeroPosIncrSloppy(mpq, 1);
545   }
546 
547   /**
548    * MPQ Combined AND OR Mode - Manually creating a multiple phrase query - with no match
549    */
550   public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException {
551     final MultiPhraseQuery mpq = new MultiPhraseQuery();
552     for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
553       Term[] terms = tapTerms(tap);
554       final int pos = tap[0].getPositionIncrement()-1;
555       mpq.add(terms, pos); //AND logic in pos, OR across lines 
556     }
557     doTestZeroPosIncrSloppy(mpq, 0);
558     mpq.setSlop(2);
559     doTestZeroPosIncrSloppy(mpq, 0);
560   }
561 
562   private Term[] tapTerms(Token[] tap) {
563     Term[] terms = new Term[tap.length];
564     for (int i=0; i<terms.length; i++) {
565       terms[i] = new Term("field",tap[i].toString());
566     }
567     return terms;
568   }
569   
570   public void testNegativeSlop() throws Exception {
571     MultiPhraseQuery query = new MultiPhraseQuery();
572     query.add(new Term("field", "two"));
573     query.add(new Term("field", "one"));
574     try {
575       query.setSlop(-2);
576       fail("didn't get expected exception");
577     } catch (IllegalArgumentException expected) {
578       // expected exception
579     }
580   }
581   
582 }